import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chisquare
from warnings import warn
from functools import lru_cache
from scipy.special import gammaln
from math import lgamma, log
from pgmpy.models import BayesianModel
from functools import partial
from IPython.display import Markdown, display
import networkx as nx
import pylab as plt
def printmd(string):
    """Render the given string as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)
Scoring functions for learning Bayesian networks: commonly used scoring functions that measure the fit between a model and the data are the Bayesian Dirichlet scores.
# ref: https://pgmpy.org/index.html
class BaseEstimator(object):
    """Base class holding the dataset and cached per-variable state counts.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Dataset with one column per variable.
    state_names : dict, optional
        Mapping ``variable -> list of admissible states``.  States observed
        in the data must be a subset of the declared ones, otherwise a
        ValueError is raised.
    complete_samples_only : bool
        Default NaN policy for ``state_counts``: drop any row containing a
        NaN (True) vs. only rows where the counted variables are NaN (False).
    """

    def __init__(self, data=None, state_names=None, complete_samples_only=True):
        self.data = data
        if self.data is not None:
            self.complete_samples_only = complete_samples_only
            self.variables = list(data.columns.values)
            if not isinstance(state_names, dict):
                # No declared states: collect them from the data.
                self.state_names = {
                    var: self._collect_state_names(var) for var in self.variables
                }
            else:
                self.state_names = dict()
                for var in self.variables:
                    if var in state_names:
                        # Observed states must be a subset of the declared ones.
                        if not set(self._collect_state_names(var)) <= set(
                            state_names[var]
                        ):
                            raise ValueError(
                                f"Data contains unexpected states for variable: {var}."
                            )
                        self.state_names[var] = state_names[var]
                    else:
                        self.state_names[var] = self._collect_state_names(var)

    def _collect_state_names(self, variable):
        "Return a sorted list of states that the variable takes in the data."
        states = sorted(list(self.data.loc[:, variable].dropna().unique()))
        return states

    def convert_args_tuple(func):
        # Convert the (possibly list-typed) ``parents`` argument to a tuple
        # so the lru_cache below only ever sees hashable arguments.
        def _convert_param_to_tuples(
            obj, variable, parents=tuple(), complete_samples_only=None
        ):
            parents = tuple(parents)
            return func(obj, variable, parents, complete_samples_only)

        return _convert_param_to_tuples

    # NOTE: lru_cache on an instance method keys on (and retains) self;
    # acceptable for notebook-scale use, but instances stay alive as long
    # as their entries sit in the cache.
    @convert_args_tuple
    @lru_cache(maxsize=2048)
    def state_counts(self, variable, parents=tuple(), complete_samples_only=None):
        """
        Return counts of how often each state of 'variable' occurred in the data.
        If parents are provided, counting is done conditionally for each state
        configuration of the parents.

        Parameters
        ----------
        variable : str
        parents : iterable of str
            Converted to a tuple by the decorator above.  (BUGFIX: the
            default used to be a mutable list, ``parents=[]``.)
        complete_samples_only : bool or None
            None falls back to the setting given to the constructor.

        Returns
        -------
        state_counts : pandas.DataFrame
            Table with state counts for 'variable' (rows: states of the
            variable, columns: parent-state configurations).
        """
        parents = list(parents)
        # default for how to deal with missing data can be set in class constructor
        if complete_samples_only is None:
            complete_samples_only = self.complete_samples_only
        # ignores either any row containing NaN, or only those where the variable or its parents is NaN
        data = (
            self.data.dropna()
            if complete_samples_only
            else self.data.dropna(subset=[variable] + parents)
        )
        if not parents:
            # count how often each state of 'variable' occurred
            state_count_data = data.loc[:, variable].value_counts()
            state_counts = (
                state_count_data.reindex(self.state_names[variable])
                .fillna(0)
                .to_frame()
            )
        else:
            parents_states = [self.state_names[parent] for parent in parents]
            # count how often each state of 'variable' occurred, conditional on parents' states
            state_count_data = (
                data.groupby([variable] + parents).size().unstack(parents)
            )
            if not isinstance(state_count_data.columns, pd.MultiIndex):
                # With a single parent, unstack yields a flat Index; lift it
                # to a MultiIndex so reindexing below is uniform.
                state_count_data.columns = pd.MultiIndex.from_arrays(
                    [state_count_data.columns]
                )
            # Reindex so unobserved states/configurations appear with count 0.
            row_index = self.state_names[variable]
            column_index = pd.MultiIndex.from_product(parents_states, names=parents)
            state_counts = state_count_data.reindex(
                index=row_index, columns=column_index
            ).fillna(0)
        return state_counts
# Importing data: GameStop (GME) daily stock series (environment-specific path).
GME_stock_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/GME_stock.csv')
# NOTE(review): this rounds every row except the first to whole numbers, then
# the line below rounds the filtered frame to 1 decimal — presumably to
# discretize prices for the count-based scores; confirm the row-0 exclusion
# is intentional.
GME_stock_df[1:] = GME_stock_df[1:].round()
# String comparison is used here; it works for ISO 'YYYY-MM-DD' formatted
# dates (lexicographic order == chronological order) — TODO confirm format.
GME_stock_df = GME_stock_df[GME_stock_df.date>'2011-01-01'].round(1)
GME_stock_df['date'] = pd.to_datetime(GME_stock_df['date'])
GME_stock_df.set_index('date',inplace=True)
display(GME_stock_df.head())
# Plotting the open, close, high, low, volume and adjusted close value for the term of 12 months
fig = px.line(GME_stock_df, x=GME_stock_df.index, y=GME_stock_df.columns,
              title='Plot of values for a 12 month period')
fig.update_xaxes(
    dtick="M12",          # one tick per 12 months
    tickformat="%b\n%Y")
fig.update_xaxes(rangeslider_visible=True)
fig.show()
# Importing data: Titanic sample
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())
class K2Score(BaseEstimator):
    """K2 score: a Bayesian Dirichlet score with every Dirichlet
    hyperparameter eta_ijk fixed to 1 (uniform prior)."""

    def __init__(self, data, **kwargs):
        super(K2Score, self).__init__(data, **kwargs)

    def K2_score_single(self, variable, parents, is_print=False):
        """Return the K2 log marginal-likelihood contribution of one node.

        Parameters
        ----------
        variable : str
            Node being scored.
        parents : iterable
            Its parents in the candidate network.
        is_print : bool
            If True, display intermediate quantities.
        """
        var_states = self.state_names[variable]
        var_cardinality = len(var_states)  # r_i: number of states of the variable
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])  # q_i: number of parent configurations
        counts = np.asarray(state_counts)  # N_ijk table (rows: states, cols: parent configs)
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):'+str(counts))
            display(state_counts.head(5))
            print('-'*100)
        # BUGFIX: np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
        log_gamma_counts = np.zeros_like(counts, dtype=np.float64)
        # Compute log(gamma(counts + 1))
        gammaln(counts + 1, out=log_gamma_counts)
        if is_print:
            printmd(r"Let $\eta_{ijk} = 1$")
            printmd(r"$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ = ")
            print(f"log_gamma_counts = \n {log_gamma_counts}")
            print('-'*100)
        # Compute the log-gamma of the conditional sample sizes N_ij = sum_k N_ijk
        log_gamma_conds = np.sum(counts, axis=0, dtype=np.float64)
        gammaln(log_gamma_conds + var_cardinality, out=log_gamma_conds)
        if is_print:
            printmd(r"$N_{ij} = \sum_k(N_{ijk})$")
            printmd(r"$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$")
            printmd(r"$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ = ")
            print(f"log_gamma_conds(-ve sign will be subtract in score computation) = \n {log_gamma_conds}")
            print('-'*100)
        # Normalizing term: q_i * log(Gamma(r_i))
        log_PG = num_parents_states * lgamma(var_cardinality)
        if is_print:
            print(f"num_parents_states = {num_parents_states}")
            print(f"var_cardinality = {var_cardinality}")
            printmd(r"$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $")
            print(f"log(P(G)) = num_parents_states * lgamma(var_cardinality) = {log_PG}")
            print('-'*100)
        # Reuse log_PG instead of recomputing the same product.
        score = (
            np.sum(log_gamma_counts)
            - np.sum(log_gamma_conds)
            + log_PG
        )
        if is_print:
            print(f"np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:")
        return score

    def score(self, model, is_print=False):
        """Sum the per-node K2 contributions over all nodes of `model`."""
        score = 0
        for node in model.nodes():
            parents = list(model.predecessors(node))
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {parents}")
            # Compute each node's contribution exactly once (the original
            # evaluated K2_score_single twice per node when is_print was set).
            node_score = self.K2_score_single(node, parents, is_print=is_print)
            if is_print:
                printmd(f"score: {node_score}")
                print("="*100)
            score += node_score
        return score
# Importing data (Titanic sample, reloaded for this section)
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())
# making a BN: chain fare -> age -> passengerclass -> portembarked -> survived
BN_1 = BayesianModel([['fare','age'], ['age','passengerclass'],['passengerclass','portembarked'], ['portembarked','survived']])
# NOTE: nx.draw renders onto the current matplotlib axes and returns None
BN_1_G = nx.draw(BN_1, with_labels=True,node_size=3e3,width=3)
plt.show()
K2Score(simple_df).score(BN_1,is_print=True)
# making a BN: survived with three direct parents
BN_2 = BayesianModel([['fare','passengerclass'], ['passengerclass','survived'],['age','survived'], ['portembarked','survived']])
nx.draw(BN_2, with_labels=True,node_size=3e3,width=3)
plt.show()
K2Score(simple_df).score(BN_2,is_print=True)
display(GME_stock_df.head(5))
# making a BN over the discretized GME stock columns
BN_GMS = BayesianModel([['open_price','adjclose_price'],
                        ['high_price','volume'],
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()
K2Score(GME_stock_df).score(BN_GMS,is_print=False)
class BDeuScore(BaseEstimator):
    """BDeu score: Bayesian Dirichlet equivalent uniform score.

    A single equivalent sample size N' is spread uniformly over the
    parameter table: alpha = N'/q_i per parent configuration and
    beta = N'/(r_i*q_i) per cell.
    """

    def __init__(self, data, equivalent_sample_size=10, **kwargs):
        # N': prior strength; larger values pull estimates toward uniform.
        self.equivalent_sample_size = equivalent_sample_size
        super(BDeuScore, self).__init__(data, **kwargs)

    def BDeu_score_single(self, variable, parents, is_print=False):
        """Return the BDeu log marginal-likelihood contribution of one node."""
        var_states = self.state_names[variable]
        var_cardinality = len(var_states)  # r_i
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])  # q_i
        counts = np.asarray(state_counts)  # N_ijk table
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):'+str(counts))
            display(state_counts.head(2))
            print('-'*100)
        # BUGFIX: np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
        log_gamma_counts = np.zeros_like(counts, dtype=np.float64)
        alpha = self.equivalent_sample_size / num_parents_states  # N'/q_i
        beta = self.equivalent_sample_size / counts.size          # N'/(r_i*q_i)
        if is_print:
            printmd("##### Let $alpha = {N'}/q_{i}$ = " + str(alpha))
            printmd("##### Let $ beta = {N'}/(r_{i}q_{i})$ = " + str(beta))
            print('-'*100)
        # Compute log(gamma(counts + beta))
        gammaln(counts + beta, out=log_gamma_counts)
        if is_print:
            printmd(r"$log(\Gamma(N_{ijk} + beta))$ = ")
            print(f"log_gamma_counts = \n {log_gamma_counts}")
            print('-'*100)
        # Compute the log-gamma of the conditional sample sizes N_ij
        log_gamma_conds = np.sum(counts, axis=0, dtype=np.float64)
        gammaln(log_gamma_conds + alpha, out=log_gamma_conds)
        if is_print:
            printmd(r"$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$")
            print(f"log_gamma_conds = \n {log_gamma_conds}")
            print('-'*100)
        score = (
            np.sum(log_gamma_counts)
            - np.sum(log_gamma_conds)
            + num_parents_states * lgamma(alpha)
            - counts.size * lgamma(beta)
        )
        if is_print:
            print(f"np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):")
        return score

    def score(self, model, is_print=False):
        """Sum the per-node BDeu contributions over all nodes of `model`."""
        score = 0
        for node in model.nodes():
            parents = list(model.predecessors(node))
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {parents}")
            # Compute once per node (the original computed it twice when printing).
            node_score = self.BDeu_score_single(node, parents, is_print=is_print)
            if is_print:
                printmd(f"score: {node_score}")
                print("="*100)
            score += node_score
        return score
# Importing data
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())
# making a BN: chain fare -> age -> passengerclass -> portembarked -> survived
BN_1 = BayesianModel([['fare','age'], ['age','passengerclass'],['passengerclass','portembarked'], ['portembarked','survived']])
BN_1_G = nx.draw(BN_1, with_labels=True,node_size=3e3,width=3)
plt.show()
BDeuScore(simple_df, equivalent_sample_size=10).score(BN_1,is_print=True)
# making a BN: survived with three direct parents
BN_2 = BayesianModel([['fare','passengerclass'], ['passengerclass','survived'],['age','survived'], ['portembarked','survived']])
nx.draw(BN_2, with_labels=True,node_size=3e3,width=3)
plt.show()
# NOTE(review): this scores BN_1 again although BN_2 was just drawn —
# presumably a copy-paste slip; BN_2 was likely intended here.
BDeuScore(simple_df, equivalent_sample_size=10).score(BN_1,is_print=True)
display(GME_stock_df.head(5))
# making a BN over the discretized GME stock columns
BN_GMS = BayesianModel([['open_price','adjclose_price'],
                        ['high_price','volume'],
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()
# Larger equivalent sample size (20) for the bigger GME dataset.
BDeuScore(GME_stock_df, equivalent_sample_size=20).score(BN_GMS,is_print=False)

Common ways to control model complexity when scoring Bayesian networks:
* limiting the number of parents per network variable;
* adding a penalization term to the log-likelihood (LL) score:
    * **MDL/BIC**
    * **AIC**
    * **NML**

ref: Journal of Machine Learning Research — "Scoring Bayesian Networks Using Mutual Information and Independence Tests" (pp. 2154–2155)
class BicScore(BaseEstimator):
    """BIC/MDL score: maximized log-likelihood minus 0.5*log(N) times the
    number of free parameters q_i*(r_i - 1) of each node."""

    def __init__(self, data, **kwargs):
        super(BicScore, self).__init__(data, **kwargs)

    def Bic_score_single(self, variable, parents, is_print=False):
        """Return the BIC contribution of `variable` given `parents`."""
        var_states = self.state_names[variable]
        var_cardinality = len(var_states)  # r_i
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])  # q_i
        counts = np.asarray(state_counts)  # N_ijk table
        sample_size = len(self.data)  # N
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):'+str(counts))
            printmd('sample_size($N$):'+str(sample_size))
            display(state_counts.head(2))
            print('-'*100)
        # BUGFIX: np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
        log_likelihoods = np.zeros_like(counts, dtype=np.float64)
        # Compute the log-counts (zero counts are left at 0 via the mask)
        np.log(counts, out=log_likelihoods, where=counts > 0)
        if is_print:
            printmd("$log(N_{ijk})(>0)$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        # Compute the log of the conditional sample sizes N_ij
        log_conditionals = np.sum(counts, axis=0, dtype=np.float64)
        np.log(log_conditionals, out=log_conditionals, where=log_conditionals > 0)
        if is_print:
            printmd("$log(N_{ij})(>0)$")
            print(f"{log_conditionals}")
            print('-'*100)
        # Maximized log-likelihood: sum_jk N_ijk * log(N_ijk / N_ij)
        log_likelihoods -= log_conditionals
        log_likelihoods *= counts
        if is_print:
            printmd("log_likelihoods = $N_{ijk} * log(N_{ijk}/N_{ij})$ = $N_{ijk}*[log(N_{ijk}) - log(N_{ij})]$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        score = np.sum(log_likelihoods)
        # Penalty: 0.5 * log(N) * (number of free parameters of this node)
        score -= 0.5 * log(sample_size) * num_parents_states * (var_cardinality - 1)
        if is_print:
            printmd(r"bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$")
            print(f"{log_likelihoods}")
            print('-'*100)
        return score

    def score(self, model, is_print=False):
        """Sum the per-node BIC contributions over all nodes of `model`."""
        score = 0
        for node in model.nodes():
            parents = list(model.predecessors(node))
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {parents}")
            # Compute once per node (the original computed it twice when printing).
            node_score = self.Bic_score_single(node, parents, is_print=is_print)
            if is_print:
                printmd(f"score: {node_score}")
                print("="*100)
            score += node_score
        return score
# Importing data
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())
# making a BN: passengerclass with three parents feeding survived
BN_1 = BayesianModel([['fare','passengerclass'], ['age','passengerclass'],['portembarked','passengerclass'], ['passengerclass','survived']])
BN_1_G = nx.draw(BN_1, with_labels=True,node_size=3e3,width=3)
plt.show()
BicScore(simple_df).score(BN_1,is_print=True)
# making a BN: survived with three direct parents
BN_2 = BayesianModel([['fare','passengerclass'], ['passengerclass','survived'],['age','survived'], ['portembarked','survived']])
nx.draw(BN_2, with_labels=True,node_size=3e3,width=3)
plt.show()
BicScore(simple_df).score(BN_2,is_print=True)
display(GME_stock_df.head(5))
# making a BN over the discretized GME stock columns
BN_GMS = BayesianModel([['open_price','adjclose_price'],
                        ['high_price','volume'],
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()
BicScore(GME_stock_df).score(BN_GMS,is_print=False)
class AicScore(BaseEstimator):
    """AIC-style score: maximized log-likelihood minus the number of free
    parameters q_i*(r_i - 1) of each node (no log(N) factor)."""

    def __init__(self, data, **kwargs):
        super(AicScore, self).__init__(data, **kwargs)

    def Aic_score_single(self, variable, parents, is_print=False):
        """Return the AIC contribution of `variable` given `parents`."""
        var_states = self.state_names[variable]
        var_cardinality = len(var_states)  # r_i
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])  # q_i
        counts = np.asarray(state_counts)  # N_ijk table
        sample_size = len(self.data)  # N
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):'+str(counts))
            printmd('sample_size($N$):'+str(sample_size))
            display(state_counts.head(2))
            print('-'*100)
        # BUGFIX: np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
        log_likelihoods = np.zeros_like(counts, dtype=np.float64)
        # Compute the log-counts (zero counts are left at 0 via the mask)
        np.log(counts, out=log_likelihoods, where=counts > 0)
        if is_print:
            printmd("$log(N_{ijk})(>0)$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        # Compute the log of the conditional sample sizes N_ij
        log_conditionals = np.sum(counts, axis=0, dtype=np.float64)
        np.log(log_conditionals, out=log_conditionals, where=log_conditionals > 0)
        if is_print:
            printmd("$log(N_{ij})(>0)$")
            print(f"{log_conditionals}")
            print('-'*100)
        # Maximized log-likelihood: sum_jk N_ijk * log(N_ijk / N_ij)
        log_likelihoods -= log_conditionals
        log_likelihoods *= counts
        if is_print:
            printmd("log_likelihoods = $N_{ijk} * log(N_{ijk}/N_{ij})$ = $N_{ijk}*[log(N_{ijk}) - log(N_{ij})]$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        score = np.sum(log_likelihoods)
        # Penalty: number of free parameters of this node.
        score -= num_parents_states * (var_cardinality - 1)
        if is_print:
            # BUGFIX: the original message mislabeled this as "bic score".
            printmd(r"aic score = $\sum log\_likelihoods - num\_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$")
            print(f"{log_likelihoods}")
            print('-'*100)
        return score

    def score(self, model, is_print=False):
        """Sum the per-node AIC contributions over all nodes of `model`."""
        score = 0
        for node in model.nodes():
            parents = list(model.predecessors(node))
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {parents}")
            # Compute once per node (the original computed it twice when printing).
            node_score = self.Aic_score_single(node, parents, is_print=is_print)
            if is_print:
                printmd(f"score: {node_score}")
                print("="*100)
            score += node_score
        return score
display(GME_stock_df.head(5))
# making a BN over the discretized GME stock columns
BN_GMS = BayesianModel([['open_price','adjclose_price'],
                        ['high_price','volume'],
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()
AicScore(GME_stock_df).score(BN_GMS,is_print=False)
class fNMLScore(BaseEstimator):
    """Factorized Normalized Maximum Likelihood (fNML) score.

    Each node's maximized log-likelihood is penalized by the log of the
    multinomial regret (parametric complexity) C^{r_i}_{N_ij} of each of
    its conditional distributions (Silander et al., 2008).
    """

    def __init__(self, data, **kwargs):
        super(fNMLScore, self).__init__(data, **kwargs)

    def Szpankowski_approximation(self, row):
        """Approximate the binary regret C^2_n via Szpankowski's expansion.

        BUGFIX: the leading factor is sqrt(n*pi/2); the original used
        n*pi/2 without the square root.
        """
        if row.n == 0:
            return 1  # C^k_0 = 1: empty data carries no regret
        return np.sqrt(0.5*row.n*np.pi)*np.exp(np.sqrt(8/(9*row.n*np.pi))+((3*np.pi-16)/(36*row.n*np.pi)))

    def Compute_regret(self, row, r):
        """Regret recurrence C^r_n = C^{r-1}_n + (n/(r-2)) * C^{r-2}_n."""
        if row.n == 0:
            return 1
        return row[f'C_{r-1}'] + (row.n/(r-2))*row[f'C_{r-2}']

    def fNML_score_single(self, variable, parents, is_print=False):
        """Return log-likelihood minus the fNML regret for one node."""
        var_states = self.state_names[variable]
        var_cardinality = len(var_states)  # r_i
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])  # q_i
        counts = np.asarray(state_counts)  # N_ijk table
        sample_size = len(self.data)  # N
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):<br/>'+str(counts))
            printmd('sample_size($N$):'+str(sample_size))
            display(state_counts.head(2))
            print('-'*100)
        # BUGFIX: np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
        log_likelihoods = np.zeros_like(counts, dtype=np.float64)
        # Compute the log-counts (zero counts are left at 0 via the mask)
        np.log(counts, out=log_likelihoods, where=counts > 0)
        if is_print:
            printmd(r"$log(N_{ijk})(>0)$ = <br/>"+f"{log_likelihoods}")
            print('-'*100)
        # Compute the log of the conditional sample sizes N_ij
        log_conditionals = np.sum(counts, axis=0, dtype=np.float64)
        np.log(log_conditionals, out=log_conditionals, where=log_conditionals > 0)
        if is_print:
            printmd(r"$log(N_{ij})(>0)$<br/>"+f"{log_conditionals}")
            print('-'*100)
        # Maximized log-likelihood: sum_jk N_ijk * log(N_ijk / N_ij)
        log_likelihoods -= log_conditionals
        log_likelihoods *= counts
        if is_print:
            printmd("log_likelihoods = $N_{ijk} * log(N_{ijk}/N_{ij})$ = $N_{ijk}*[log(N_{ijk}) - log(N_{ij})]$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        # Build the regret table C^r_n only for the distinct N_ij values.
        N_ij = np.sum(counts, axis=0, dtype=np.float64)
        N_ij_unique, N_ij_cnt = np.unique(N_ij, return_counts=True)
        N_ij_dist = dict(zip(N_ij_unique, N_ij_cnt))
        C_ini = [[n, 1] for n in N_ij_unique]  # C^1_n = 1 for all n
        C_lookup_table = pd.DataFrame(C_ini, columns = ['n', 'C_1'])
        if var_cardinality > 1:
            C_lookup_table['C_2'] = C_lookup_table.apply(self.Szpankowski_approximation,axis=1)
        if var_cardinality > 2:
            for r_i in range(3,var_cardinality+1):
                Compute_regret_par = partial(self.Compute_regret, r=r_i)
                C_lookup_table[f'C_{r_i}'] = C_lookup_table.apply(Compute_regret_par,axis=1)
                if r_i > 5: # only keep the last 3 regret columns for memory saving
                    C_lookup_table = C_lookup_table.drop(columns=[f'C_{r_i-3}'])
        C_lookup_table['fNML_regret'] = np.log(C_lookup_table[f'C_{var_cardinality}'])
        fNML_regret = C_lookup_table.fNML_regret.values
        if is_print:
            print(f"N_ij distribution:\n {N_ij_dist}")
            display(C_lookup_table)
            print("fNML_regret = \n"+f"{fNML_regret}")
            print('-'*100)
        score = np.sum(log_likelihoods)
        # BUGFIX: the regret must be summed over *all* parent configurations,
        # so each distinct N_ij's regret is weighted by its multiplicity
        # N_ij_cnt (the original summed each unique value only once).
        score -= np.sum(fNML_regret * N_ij_cnt)
        if is_print:
            printmd(r"fNML score = $\sum log\_likelihoods - \sum logC^{r_i}_{N_{ij}}$")
            print('-'*100)
        return score

    def score(self, model, is_print=False):
        """Sum the per-node fNML contributions over all nodes of `model`."""
        score = 0
        for node in model.nodes():
            parents = list(model.predecessors(node))
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {parents}")
            # Compute once per node (the original computed it twice when printing).
            node_score = self.fNML_score_single(node, parents, is_print=is_print)
            if is_print:
                printmd(f"score: {node_score}")
                print("="*100)
            score += node_score
        return score
# Importing data
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())
# making a BN: passengerclass with three parents feeding survived
BN_1 = BayesianModel([['fare','passengerclass'], ['age','passengerclass'],['portembarked','passengerclass'], ['passengerclass','survived']])
BN_1_G = nx.draw(BN_1, with_labels=True,node_size=3e3,width=3)
plt.show()
fNMLScore(simple_df).score(BN_1,is_print=True)
display(GME_stock_df.head(5))
# making a BN over the discretized GME stock columns
BN_GMS = BayesianModel([['open_price','adjclose_price'],
                        ['high_price','volume'],
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()
fNMLScore(GME_stock_df).score(BN_GMS,is_print=False)
display(GME_stock_df.shape)
display(GME_stock_df.head(5))
# making a BN over the discretized GME stock columns
BN_GMS = BayesianModel([['open_price','adjclose_price'],
                        ['high_price','volume'],
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()
# Sample sizes: a few small ones (10, 35, 60, 85), then every 100 rows up to the full dataset.
sample_size_ls = [n for n in np.arange(10, 100, 25)] + [n for n in np.arange(100, len(GME_stock_df), 100)]
print(sample_size_ls)
# Collect each score's value for every sample size (same network throughout).
score_rank = {'sample_size': sample_size_ls,
              'K2Score':[],
              'BDeuScore':[],
              'BicScore':[],
              'AicScore':[],
              'fNMLScore':[],
              }
for sample_size in sample_size_ls:
    # Fixed random_state so every estimator scores the same subsample.
    sample_data = GME_stock_df.sample(n=sample_size, random_state=1)
    score_rank['K2Score'].append(K2Score(sample_data).score(BN_GMS,is_print=False))
    # BDeu equivalent sample size scales with the subsample (N/10).
    score_rank['BDeuScore'].append(BDeuScore(sample_data,
                                             equivalent_sample_size=sample_size/10).score(BN_GMS,is_print=False))
    score_rank['BicScore'].append(BicScore(sample_data).score(BN_GMS,is_print=False))
    score_rank['AicScore'].append(AicScore(sample_data).score(BN_GMS,is_print=False))
    score_rank['fNMLScore'].append(fNMLScore(sample_data).score(BN_GMS,is_print=False))
score_rank_df = pd.DataFrame(score_rank)
display(score_rank_df.head())
# Long format for plotly; NOTE(review): log(-score) assumes every score is
# negative — holds for these likelihood-based scores on non-degenerate data.
df_melt = score_rank_df.melt(id_vars='sample_size', value_vars=score_rank_df.columns.to_list()[1:])
df_melt['log_score'] = np.log(-df_melt.value)
fig = px.line(df_melt, x='sample_size' , y='log_score', color='variable')
fig.show()
# Repeat the comparison, but let the BDeu equivalent sample size grow with
# the sample size by a different heuristic (instead of the fixed N/10 above).
sample_size_ls = [n for n in np.arange(10, 100, 25)] + [n for n in np.arange(100, len(GME_stock_df), 100)]
print(sample_size_ls)
score_rank = {'sample_size': sample_size_ls,
              'K2Score':[],
              'BDeuScore':[],
              'BicScore':[],
              'AicScore':[],
              'fNMLScore':[],
              }
c=1  # multiplier that increments once sample sizes reach 200
for sample_size in sample_size_ls:
    # Fixed random_state so every estimator scores the same subsample.
    sample_data = GME_stock_df.sample(n=sample_size, random_state=1)
    score_rank['K2Score'].append(K2Score(sample_data).score(BN_GMS,is_print=False))
    # Heuristic equivalent sample size: N/5 for small samples, then an
    # increasing multiple of N/50 for larger ones.
    if sample_size < 200:
        r = sample_size/5
    else:
        c+=1
        r = sample_size/(50)*c
    score_rank['BDeuScore'].append(BDeuScore(sample_data,
                                             equivalent_sample_size=r).score(BN_GMS,is_print=False))
    score_rank['BicScore'].append(BicScore(sample_data).score(BN_GMS,is_print=False))
    score_rank['AicScore'].append(AicScore(sample_data).score(BN_GMS,is_print=False))
    score_rank['fNMLScore'].append(fNMLScore(sample_data).score(BN_GMS,is_print=False))
score_rank_df = pd.DataFrame(score_rank)
display(score_rank_df.head())
# NOTE(review): log(-score) assumes every score is negative (see above).
df_melt = score_rank_df.melt(id_vars='sample_size', value_vars=score_rank_df.columns.to_list()[1:])
df_melt['log_score'] = np.log(-df_melt.value)
fig = px.line(df_melt, x='sample_size' , y='log_score', color='variable')
fig.show()